
global root_dir = "`1'"

include "$root_dir/code/config/config.do"


cap noi log using ${log_dir}/9_prep_ind6090_ALM_regression.log, replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
di "${wtype}"
capture noi {



* 8_merge_alm_prep_regression.do

/*
This do-file combines our ind6090 level patent counts with the ALM data and prepares the regression variables

In the previous steps, we built three different industry-patent concordances:
    using = adhps concordance from uspto to industry and then BEA to sector of use; baseline version
    mk/use = sector of manufcaturing 
    LZ = Lybbert and Zolas mapping via IPC4-NAICS1997


*/

global cvers _ipc4
local dtvers use mk lz
* -------------------------------------------------------------
* Merge the three patent count data, collapse to relevant period
* -------------------------------------------------------------

*import use, LZ and make files
foreach dtver of local dtvers { 
    use ${alm_data_proc}/patents_ind6090_`dtver'${cvers}.dta, clear

    *format a little
    ren ind6090* ind6090
    ds appln_year ind6090, not
    local r: di r(varlist)
    foreach v of local r { 
        ren `v' `v'_`dtver'
    }
    tempfile patdt_`dtver'
    save `patdt_`dtver'', replace
}

*combine the sets
use `patdt_use', clear
gen in_use = 1
mmerge appln_year ind6090 using `patdt_mk', unmatched(both)
gen in_mk = ( _m == 3 | _m == 2)

mmerge appln_year ind6090 using `patdt_lz', unmatched(both)
gen in_lz = ( _m == 3 | _m == 2)

drop _m
sort ind6090 appln_year 

*sum up patent counts by relevant interval and ind6090 code
drop if appln_year < 1980 | appln_year >1998 
drop appln_year
ds ind6090, not
local r: di r(varlist)

*sum patent counts to industry level
collapse  (sum) `r', by(ind6090)
* assign the counts to the end year 1998
gen year = 98
tempfile ind_patcounts_8098${cvers}
save `ind_patcounts_8098${cvers}', replace

* --------------------------------------
* Merge with ALM data
* --------------------------------------

* prep industry title
* Autor, David H. "The Skill Content of Recent Technological Change: An Empirical Exploration" Quarterly Journal of Economics,
* 118(4), November 2003, 1279-1334. Accessed October 2022. https://economics.mit.edu/people/faculty/david-h-autor/data-archive.
import excel using ${alm_data_raw}/alm/IndKey.xls, firstrow sheet("Ind7090-Ind6090 XWalk")clear
destring ind6090, replace
ren Title title
keep ind6090 title
drop if ind6090 == .
tempfile titles
save `titles', replace

* prep ALM data by removing industry codes without patent counts and keeping the period endpoints
*Autor, David H., Frank Levy, and Richard J. Murnane. 2003. "The Skill Content of Recent Technological Change:
* An Empirical Exploration." Quarterly Journal of Economics 118 (4): 1279-1334. Accessed November 2020. 
*https://economics.mit.edu/people/faculty/david-h-autor/data-archive
use ${alm_data_raw}/alm/inddot77means6098-centiles-cen60basis.dta, clear 
keep if year == 80 | year == 98
order ind6090 nipa6090 year yrtext edcat sex 

* merge industry title 
mmerge ind6090 using `titles'
drop _m

* merge patent counts and ALM's main table dataset; remove industry codes without patent counts
merge m:1 ind6090 year using `ind_patcounts_8098${cvers}'
sort ind6090 year 
by ind6090: egen tot_machinery_patents = sum(in_relevant_field_use + in_relevant_field_mk + in_relevant_field_lz)

log using ${numb_dir}/ind6090_industries_without_patents${cvers}.log, replace name(num)
preserve 
duplicates drop ind6090, force
list ind6090 if tot_machinery_patents == 0
restore
cap log close num
drop if tot_machinery_patents == 0
drop _m 

* ---------------------------------
* Prepare regressions
* ---------------------------------

* Only look at both gender and make sure we have the same both gender-industries for every year
sort ind6090 year edcat
keep if sex == 0
egen count=count(1*(sex==0)), by(ind6090)
drop if count<2

*Reorder so that we get correct 1st diffs
recode year 80=88

* Create the computer use variables by year
gen tag = (use84==. | use89==. | use93==. | use97==.)
drop if use84==. | use89==. | use93==. | use97==.
gen computeruse=10*(use97-use84)/13

/* Edcats;
- 0: all
- 1: HSD (high school degree)
- 2: HSG (highs school graduate)
- 3: SMC ()
- 4: CLG (college degree)
*/

* Create employment counts for all ed categories
egen emp_hsd = total(lswt) if edcat == 1, by(ind6090 year)
egen emp_hsg = total(lswt) if edcat == 2, by(ind6090 year)
egen emp_smc = total(lswt) if edcat == 3, by(ind6090 year)
egen emp_clg = total(lswt) if edcat == 4, by(ind6090 year)
egen emp_hsdhsg = total(lswt) if edcat == 1 | edcat == 2, by(ind6090 year)
replace emp_hsd = emp_hsd[_n+1]
replace emp_hsg = emp_hsg[_n+2]
replace emp_hsdhsg = emp_hsdhsg[_n+2]
replace emp_smc = emp_smc[_n+3]
replace emp_clg = emp_clg[_n+4]

* Create the high low skilled worker ratio as dependent variable
egen hskilled = total(lswt) if edcat == 4, by(ind6090 year)
egen lskilled = total(lswt) if edcat == 1 | edcat == 2 | edcat == 3, by(ind6090 year)
replace hskilled = hskilled[_n+4] if edcat == 0
replace lskilled = lskilled[_n+1] if edcat == 0
drop if edcat ~= 0
gen ratioHL = hskilled/lskilled 
replace ratioHL = 0 if hskilled == . 
replace ratioHL = 1 if lskilled == . 
quietly by ind6090: gen dratioHL = ratioHL-ratioHL[_n-1]

* And the log employment count differences
foreach var in lskilled hskilled emp_hsd emp_hsg emp_hsdhsg emp_smc emp_clg { 
    gen log_`var' = log(`var')
    by ind6090: gen dlog_`var' = log_`var' - log_`var'[_n-1]
}

* Dot variables scaled 0 to 10
sort ind6090 year
quietly by ind6090: gen dqmath   =mqmath-mqmath[_n-1]
quietly by ind6090: gen dqdcp    =mqdcp-mqdcp[_n-1]
quietly by ind6090: gen dqsts    =mqsts-mqsts[_n-1]
quietly by ind6090: gen dqfing   =mqfinger-mqfinger[_n-1]
quietly by ind6090: gen dqehf    =mqehf-mqehf[_n-1]

* 10 times annualized changes (of 18 years)
for var dqmath dqdcp dqsts dqfing dqehf: replace X=10*(X/(18))

* Create the patent variables (share automation), biadic and non biadic
foreach dtver of local dtvers { 
    foreach tech in auto90 auto95 pauto90 pauto95 {  
            gen sh_`tech'_bia_`dtver' = (`tech'_b_`dtver'/in_relevant_field_b_`dtver')
            replace sh_`tech'_bia_`dtver' = 0 if (in_relevant_field_b_`dtver' == 0 | `tech'_b_`dtver' == 0)
            gen sh_`tech'_`dtver' = (`tech'_`dtver' /in_relevant_field_`dtver')
            replace sh_`tech'_`dtver' = 0 if (in_relevant_field_`dtver' == 0 | `tech'_`dtver' == 0)
        }
    ren in_relevant_field_b_`dtver' in_relevant_field_bia_`dtver'
    replace in_`dtver' = 1 if in_`dtver' > 0 
}

* Rescale weights to sum to 1 in ea. year; and create  an average weight for the first difference regressions
egen totwt=sum(lswt),by(year) 
gen lswt_sh =lswt/totwt
sort ind6090 year
by ind6090: gen avwt=(lswt_sh + lswt_sh[_n-1])/2
clonevar emp_mid_weight_alm = avwt 

* properly scale employment and do emp_change in logs
by ind6090: gen emp_midpoint =(lswt + lswt[_n-1])/2
replace emp_midpoint = emp_midpoint / 1000
gen emp_initial = lswt[_n-1] / 1000
gen emp_end = lswt / 1000
clonevar emp_init_alm = emp_initial
clonevar emp_mid_alm = emp_midpoint
clonevar emp_end_alm = emp_end
gen log_emp_alm = log(emp_end)
by ind6090: gen dlog_emp_alm = log_emp_alm - log_emp_alm[_n-1]

* add manufacturing indicator and cluster-groups
egen uas_ind6090 = group(sh_auto95_use)
tostring ind6090, gen(ind_3)
gen ind_2 = substr(ind_3, 1, 2)
destring ind_2, replace
gen manuf = 0
replace manuf = 1 if inrange(ind_2, 10, 39) & length(ind_3) == 3
drop ind_2 ind_3

*Re-re-order and retain observations that have the correct endpoint
recode year 88=80
keep if year == 98

* Save
keep year title in_* computeruse uas_ind6090 manuf ind6090 *auto* in_relevant_field* emp_initial emp_mid_weight_alm emp_*_alm log_emp_alm dlog_emp_alm avwt computeruse dqmath dqdcp dqsts dqfing dqehf hskilled lskilled dlog_hskilled dlog_lskilled dlog_emp_* dratioHL
save ${final_dir}/ind6090_patents_regression_data${cvers}.dta, replace

* ---------------------------------
* Merge NBER
* ---------------------------------

use ${alm_data_proc}/nber_ind6090_198098.dta, clear
ds ind6090, not
local r: di r(varlist)
foreach v of local r { 
    ren `v' `v'_nber
}
tempfile empdt_nber
save `empdt_nber', replace

* merge into ind6090 dataset
use ${final_dir}/ind6090_patents_regression_data${cvers}.dta, clear
mmerge ind6090 using `empdt_nber', unmatched(both)
gen in_nber = ( _m == 3)
drop _m

* merge in trade measure
mmerge ind6090 using ${alm_data_proc}/trade_ind6090_198098.dta
drop _m

drop *rm6* *hsd *hsg*

sleep 2000
save ${final_dir}/ind6090_patents_regression_data${cvers}.dta, replace

}
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

cap log close dat